#-----------------------------------#
# Replication of:
# Seeking Opportunity in the Knowledge Economy: Moving Places, Moving Politics?
# Valentina Consiglio, Thomas Kurer

# File: 1_opportunity_index

# (1) Create Opportunity Index
# This file creates our multidimensional Opportunity Index at the Kreis-level (NUTS-3)
# along with Figures/Tables for visualization, validation and sensitivity

# Our index relies on a total of 17 indicators
  # 11 of those indicators are publicly available
  # 6 of those indicators are proprietary and may be requested from Prognos (www.prognos.com)

# IMPORTANT NOTE:
# This code reproduces our opportunity index *if applied to the full set of input indicators*.
# Since the full input data cannot be shared, the code will also run on the publicly provided reduced set,
# but this will NOT reproduce the exact index values used in our analysis.
# However, the index values based on the public indicator subset are substantively very similar.
# This code is provided for transparency and methodological clarity only.
# To work with the actual index values used in our study, please load the file:
# "opportunity_KE_by_krs.csv"
# All figures and tables in the main body of our manuscript can be reproduced based on that file.
#-----------------------------------#


# PREPARATION ----

# clear console
cat("\014")  
# numbers in scientific notation
options(scipen=999)

set.seed(1115)

## Packages ----

if (!require("pacman")) install.packages("pacman") 

pacman::p_load(tidyverse, magrittr, devtools, broom, devtools, boot, factoextra, remotes, hrbrthemes, plm, estimatr, sandwich, lmtest, AER, lfe, huxtable, margins, readstata13, texreg, reshape2, readxl, xtable, sf, haven, ggrepel, ggxmean, ggforce, ggbiplot, viridis)

install_github("vqv/ggbiplot")
library(ggbiplot)

## Functions ----

normalize01 <- function(x, ...){(x - min(x, ...)) / (max(x, ...) - min(x, ...))}

# LOAD DATA ----

# load Kreis-level data set collected from various sources (see Table A.1)
# remember that the publicly provided data set contains only 11 out of 17 indicators
# to fully replicate our index values, the complete set of indicators is needed

if (file.exists("data_original/indicators_complete.RData")) {
  load("data_original/indicators_complete.RData")
  df <- indicators_complete
} else {
  load("data_original/indicators_public.RData")
  df <- indicators_public
}

# INDEX CREATION ----

## Select set of variables for index in main analysis ----

opm_vars <- df %>% 
  dplyr::select(# meta data
         bundesl_code, Kommune, # NUTS-2 code, NUTS-3 name
         urbanity3, # rural-urban
         
         # index indicators
         nr_jobs_sc, share_high_skill, # jobs
         wpcentr, starts_with("pop_dynamic"), logpop, # population and influx, 
         patent_pc, # intangible assets 
         starts_with("pendel"), # mobility
         broadband, # connectivity
         starts_with("immopreis_inc_relation"), # housing, proprietary
         artists_pc,  nightclubs_pc, theaters_pc, # cultural amenities
         starts_with("pupils_per_teacher"), # other amenities, proprietary
         playgrounds_pc, # other amenities
         starts_with("vereine_pc"), # social capital, proprietary
         starts_with("oenv"), # public transport, proprietary
         service_provision # provision of basic services
         ) %>%
  dplyr::select(-contains("proprietary")) %>% # nothing is omitted with indicators_complete, proprietary data is omitted with indicators_public
  na.omit() # complete data necessary for PCA

## Principal Component Analysis ----

opm.pca <- prcomp(opm_vars[,4:length(opm_vars)], center = TRUE, scale. = TRUE)

# Scree plot
fviz_eig(opm.pca, addlabels = TRUE, ylim = c(0, 100))

# Note: sign indeterminacy of PCA, direction of component can vary between R versions
# here we manually flip the sign of PC2 so that both components point in the same (negative) direction
opm.pca$rotation[,2] <- -opm.pca$rotation[,2]
opm.pca$x[,2] <- -opm.pca$x[,2]

# Scree plot
fviz_eig(opm.pca, addlabels = TRUE, ylim = c(0, 100))
# changing direction does not affect components

### Plot first two components ----

ggbiplot(opm.pca)
ggsave("figures/Figure_1a.png", width=6, height=5, dpi = 600, units = "in")

ggbiplot(opm.pca, labels=opm_vars$Kommune)
ggsave("figures/Figure_1b.png", width=6, height=5, dpi = 600, units = "in")

# Components by urbanity (Appendix)
ggbiplot(opm.pca, ellipse=TRUE, groups=factor(opm_vars$urbanity3)) + 
  scale_color_manual("Population Density",values=c("cadetblue1","cornflowerblue", "darkblue"), labels=c("low", "mid", "high"))
ggsave("figures/Appendix_Figure_B1.png", width=6, height=5, dpi = 600, units = "in")

## Create weighted index ----

df$pc1 <- scale(opm.pca$x[,1]) * (-1) # recode so that higher values are positive
df$pc2 <- scale(opm.pca$x[,2]) * (-1) # recode so that higher values are positive

df$pc1_01 <- normalize01(df$pc1)
df$pc2_01 <- normalize01(df$pc2)

# extract explained var
df$prop_var1 <- (opm.pca$sdev^2 / sum(opm.pca$sdev^2))[1]
df$prop_var2 <- (opm.pca$sdev^2 / sum(opm.pca$sdev^2))[2]

# create weights based on relative explained var
df$pc_w1 <- df$prop_var1/(df$prop_var1+df$prop_var2)
df$pc_w2 <- df$prop_var2/(df$prop_var1+df$prop_var2)

### index = weighted sum of first two components ----

df$opportunity_KE_pc <- (df$pc1*df$pc_w1) + (df$pc2*df$pc_w2) 

# scaled to 0 - 1
df$opportunity_KE_pc01 <- normalize01(df$opportunity_KE_pc)

### create adjustment for commuting ----

most_relevant_commute <- df %>% dplyr::select(krs, krs_max_pendel, maxpendel_sc) # kreis code, most relevant commute kreis code, per capita commuters to most relevant commute kreis

most_relevant_commute <- merge(most_relevant_commute, df[, c("krs", "opportunity_KE_pc01", "pc1_01", "pc2_01")], by.x="krs_max_pendel", by.y="krs") # merge most relevant commute kreis opportunity
most_relevant_commute <- most_relevant_commute %>% dplyr::arrange(krs) %>% dplyr::select(-krs_max_pendel) %>%
  dplyr::rename(commute_weight=maxpendel_sc, # rename to reflect that characteristics are from most relevant commute kreis
                commute_opportunity_KE_pc01=opportunity_KE_pc01,
                commute_pc1_01=pc1_01,
                commute_pc2_01=pc2_01)

df <- merge(df, most_relevant_commute, by="krs", all.x=TRUE) # merge most relevant commute kreis characteristics back to original df

# index adjusted for commuting kreis characteristics, weighted by share of population that commutes to that kreis
df$opportunity_KE_pc01_wn <- df$opportunity_KE_pc01 + (df$commute_opportunity_KE_pc01 * df$commute_weight)

# same weighting procedure for individual components
df$pc1_01_wn <- df$pc1_01 + (df$commute_pc1_01 * df$commute_weight)
df$pc2_01_wn <- df$pc2_01 + (df$commute_pc2_01 * df$commute_weight)

# ranked opportunity in addition to index value

df %<>% mutate(
  rank_KE_total = dense_rank(desc(opportunity_KE_pc01)),
  rank_KE_total_wn = dense_rank(desc(opportunity_KE_pc01_wn)),
  rank_KE_work = dense_rank(desc(pc1_01)),
  rank_KE_work_wn = dense_rank(desc(pc1_01_wn)),
  rank_KE_amenity = dense_rank(desc(pc2_01)),
  rank_KE_amenity_wn = dense_rank(desc(pc2_01_wn)),
)

### Top and Bottom 10 ----

top_opc <- df %>% select(Kommune, opportunity_KE_pc01 ) %>% slice_max(opportunity_KE_pc01 , n=10)
bottom_opc <- df %>% select(Kommune, opportunity_KE_pc01 ) %>% slice_min(opportunity_KE_pc01 , n=10)

topbottom_opc <- cbind(top_opc, bottom_opc)
names(topbottom_opc) <- c("Place", "Top-10", "Place", "Bottom-10")
xtable(topbottom_opc)

top_opc_wn <- df %>% select(Kommune, opportunity_KE_pc01_wn) %>% slice_max(opportunity_KE_pc01_wn, n=10)
bottom_opc_wn <- df %>% select(Kommune, opportunity_KE_pc01_wn) %>% slice_min(opportunity_KE_pc01_wn, n=10)

topbottom_opc_wn <- cbind(top_opc_wn, bottom_opc_wn)
names(topbottom_opc_wn) <- c("Place", "Top-10", "Place", "Bottom-10")

topbottom10_table <- xtable(topbottom_opc_wn)

# Export Table 1
print(topbottom10_table, file = "tables/Table_2.tex")

### Maps ----

# load shape file
shape <- sf::st_read("data_original/shapes/vg2500_krs.shp", stringsAsFactors=FALSE)

shape$region_code <- as.numeric(shape$RS)

# Fix Goettingen merger 2016
shape_goettingen <- shape %>% filter(region_code== 3156 | region_code==3152)
shape_goettingen %<>% 
  summarize(geometry = sf::st_union(geometry))

shape$region_code[shape$region_code==3152] <- 3159
shape$geometry[shape$region_code==3159] <- shape_goettingen$geometry
shape %<>% filter(region_code!=3156)

# merge opportunity and shape
opportunity <- df %>% dplyr::select(region_code, Kommune, opportunity_KE_pc01, opportunity_KE_pc01_wn, pc1_01, pc1_01_wn, pc2_01, pc2_01_wn)
shape_opportunity <- merge(shape, opportunity, by="region_code", all.x=TRUE)

# Main Map

opportunity_map <- ggplot(data = shape_opportunity) +
  geom_sf() +
  geom_sf(data = shape_opportunity, aes(fill = opportunity_KE_pc01_wn), lwd=0, colour="white") +
  scale_fill_viridis() + 
  theme(legend.position="bottom", legend.title=element_blank())

ggsave(file="figures/Figure_2.png", plot=opportunity_map, height=10, width=10, dpi = 600, units = "in")

# Map by component (Labor Market, Amenity) separately 

opportunity_map_pc1 <- ggplot(data = shape_opportunity) +
  geom_sf() +
  geom_sf(data = shape_opportunity, aes(fill = pc1_01_wn), lwd=0, colour="white") +
  scale_fill_viridis() + 
  theme(legend.position="bottom", legend.title=element_blank())

ggsave(file="figures/Appendix_Figure_A1.png", plot=opportunity_map_pc1, height=10, width=10, dpi = 600, units = "in")

opportunity_map_pc2 <- ggplot(data = shape_opportunity) +
  geom_sf() +
  geom_sf(data = shape_opportunity, aes(fill = pc2_01_wn), lwd=0, colour="white") +
  scale_fill_viridis() + 
  theme(legend.position="bottom", legend.title=element_blank())

ggsave(file="figures/Appendix_Figure_A2.png", plot=opportunity_map_pc2, height=10, width=10, dpi = 600, units = "in")

# Blended components

# Normalize components to 0-1 range
shape_opportunity <- shape_opportunity %>%
  mutate(
    pc1_01_blend = (pc1_01_wn - min(pc1_01_wn, na.rm = TRUE)) / (max(pc1_01_wn, na.rm = TRUE) - min(pc1_01_wn, na.rm = TRUE)),
    pc2_01_blend = (pc2_01_wn - min(pc2_01_wn, na.rm = TRUE)) / (max(pc2_01_wn, na.rm = TRUE) - min(pc2_01_wn, na.rm = TRUE)),
    color = rgb(pc1_01_blend, 0, pc2_01_blend) # Create blended colors
  )

# blended map
opportunity_map_blended <- ggplot(data = shape_opportunity) +
  geom_sf(aes(fill = color), lwd = 0, colour = "white") +
  scale_fill_identity() +
  theme_minimal() +
  theme(legend.position = "none") + # No default legend
  coord_sf()

ggsave(file="figures/Appendix_Figure_A3.png", plot=opportunity_map_blended, height=10, width=10, dpi = 600, units = "in")


# blending legend
legend_data <- expand.grid(
  pc1_01_blend = c(0, 0.5, 1),  # Levels for Component 1 (Red)
  pc2_01_blend = c(0, 0.5, 1)   # Levels for Component 2 (Blue)
)

# Add blended colors to the legend data
legend_data <- legend_data %>%
  mutate(color = rgb(pc1_01_blend, 0, pc2_01_blend))

# Create the legend plot
opportunity_map_blended_legend <- ggplot(legend_data, aes(x = pc1_01_blend, y = pc2_01_blend, fill = color)) +
  geom_tile() +
  scale_fill_identity() +
  theme_minimal() +
  labs(x = "Labor Market", y = "Amenities", title = "") +
  scale_x_continuous(breaks = c(0, 0.5, 1), labels = c("Low", "Medium", "High")) +
  scale_y_continuous(breaks = c(0, 0.5, 1), labels = c("Low", "Medium", "High")) +
  theme(
    axis.title = element_text(size = 36),
    axis.text = element_text(size = 28),
    panel.grid = element_blank(),
    axis.ticks = element_blank()
  )

ggsave("figures/Appendix_Figure_A3_legend.png", plot=opportunity_map_blended_legend, height=10, width=8, dpi = 600, units = "in")

## Export opportunity index for replication ----

opportunity_KE_by_krs <- df %>% arrange(rank_KE_total_wn) %>% dplyr::select(Kommune, krs, contains("rank"), opportunity_KE_pc01, opportunity_KE_pc01_wn, pc1_01, pc1_01_wn, pc2_01, pc2_01_wn)
opportunity_KE_by_krs

opportunity_KE_by_krs %<>% dplyr::rename(
  pc1_KE_01 = pc1_01,
  pc1_KE_01_wn = pc1_01_wn,
  pc2_KE_01 = pc2_01,
  pc2_KE_01_wn = pc2_01_wn
  
)

write.csv(opportunity_KE_by_krs, file= "data_created/opportunity_KE_by_krs.csv")

## Illustration: Index characteristics ----

### Variation within similar-sized kreise ----

# differentiate three bins of population size
df$pop3 <- NA
df$pop3[df$population.bbsr<100000] <- 1
df$pop3[df$population.bbsr>=100000&df$population.bbsr<200000] <- 2
df$pop3[df$population.bbsr>=200000] <- 3
table(df$pop3)

# min/max by pop group for plot label
df %<>% 
  group_by(pop3) %>% 
  mutate_at(vars(opportunity_KE_pc01_wn), list(pop3_min=min, pop3_max=max)) %>%
  ungroup()

df$pop3_label <- ifelse(df$opportunity_KE_pc01_wn==df$pop3_min | df$opportunity_KE_pc01_wn==df$pop3_max, df$Kommune, NA)

# reset seed for reproducible jittering
set.seed(1115) 

ggplot(df, aes(x=factor(pop3), y=opportunity_KE_pc01_wn, label=pop3_label)) + 
  geom_jitter(alpha=0.5, width=0.2) + 
  geom_text_repel() + 
  xlab("Population") + ylab("Opportunity Index") + 
  scale_x_discrete(labels = c('<100k','100-200k','>200k')) +
  theme_bw()
ggsave("figures/Appendix_Figure_B2a.png", dpi = 600, units = "in")

### Variation within similarly dense kreise ----

df <- df %>%
  mutate(popdens4 = ntile(popdens, 4))
table(df$popdens4)

# min/max by pop group for plot label
df %<>% 
  group_by(popdens4) %>% 
  mutate_at(vars(opportunity_KE_pc01_wn), list(popdens4_min=min, popdens4_max=max)) %>%
  ungroup()

df$popdens4_label <- ifelse(df$opportunity_KE_pc01_wn==df$popdens4_min | df$opportunity_KE_pc01_wn==df$popdens4_max, df$Kommune, NA)

# reset seed for reproducible jittering
set.seed(1115) 

ggplot(df, aes(x=factor(popdens4), y=opportunity_KE_pc01_wn, label=popdens4_label)) + 
  geom_jitter(alpha=0.5, width=0.2) + 
  geom_text_repel() + 
  xlab("Population Density") + ylab("Opportunity Index") + 
  scale_x_discrete(labels = c('Q1','Q2','Q3', 'Q4')) +
  theme_bw()
ggsave("figures/Appendix_Figure_B2b.png", dpi = 600, units = "in")

### Correlation with population change ----

ggplot(df, aes(x = opportunity_KE_pc01_wn, y = pop_dynamic)) + 
  geom_point(shape = 1) +  
  geom_smooth(method = "lm", se = FALSE, color = "black") +  
  xlab("Opportunity") + ylab("Population Change in %, 2010-2015") +
  theme_bw()

ggsave("figures/Appendix_Figure_B3.png", dpi = 600, units = "in")


### Illustration of commuting adjustment ----

# three specific cases
commuting_illu <- df %>% filter(krs==12060 | krs==11000 | krs==9162 | krs==9184 | krs==8335 | krs==8435) %>% select(Kommune, opportunity_KE_pc01, opportunity_KE_pc01_wn, opportunity_KE_pc01_wn, commute_weight, commute_opportunity_KE_pc01)
commuting_illu$x <- c(4,1, 4,1, 4,1)
commuting_illu$y <- c(7, 7,4,4,1,1)

ggplot() + 
  
  geom_circle(data = commuting_illu, mapping = aes(x0 = x,  y0 = y, r = opportunity_KE_pc01)) + 
  geom_circle(data = commuting_illu, mapping = aes(x0 = x,  y0 = y, r = opportunity_KE_pc01_wn), color="red") + 

  geom_segment(data=commuting_illu, aes(x=1, xend=4, y=6.9, yend=6.9), arrow=arrow(), size=1.6) +
  geom_segment(data=commuting_illu, aes(x=4, xend=1, y=7.1, yend=7.1), arrow=arrow(), size=0.02) +
  
  geom_segment(data=commuting_illu, aes(x=1, xend=4, y=3.9, yend=3.9), arrow=arrow(), size=0.02) +
  geom_segment(data=commuting_illu, aes(x=4, xend=1, y=4.1, yend=4.1), arrow=arrow(), size=0.04) +

  geom_segment(data=commuting_illu, aes(x=1, xend=4, y=0.8, yend=0.8), arrow=arrow(), size=2.0) +
  geom_segment(data=commuting_illu, aes(x=4, xend=1, y=1.2, yend=1.2), arrow=arrow(), size=0.055) +
  
  annotate("text", x = 1, y = 7.5, label = "Barnim") +   annotate("text", x = 4, y = 7.5, label = "Berlin") +
  annotate("text", x = 1, y = 4.8, label = "Konstanz") +   annotate("text", x = 4, y = 4.8, label = "Bodenseekreis") +
  annotate("text", x = 1, y = 2.2, label = "München, LK") +   annotate("text", x = 4, y = 2.2, label = "München (Stadt)") +

  xlab("") + ylab("") +
  
  coord_fixed() +
  theme_void()

ggsave("figures/Appendix_Figure_C1.png", dpi = 600, units = "in", bg = "white")


### Correlation with political outcomes ----

df %>% mutate(turnout=voters/eligible_to_vote) %>%
ggplot(aes(x=opportunity_KE_pc01_wn, y=turnout, size=population.bbsr)) +
  geom_point(shape=1) + geom_smooth(method="lm", color="black", se=FALSE) +
  xlab("Opportunity") + ylab("Turnout") +
  theme_bw() + theme(legend.position = "none") 
ggsave(file="figures/Figure_3a.png", dpi = 600, units = "in")

df %>% mutate(center_left = spd + greens) %>%
ggplot(aes(x=opportunity_KE_pc01_wn, y=center_left, size=population.bbsr)) +
  geom_point(shape=1) + geom_smooth(method="lm", color="black", se=FALSE) +
  xlab("Opportunity") + ylab("Center-Left Vote Share") +
  theme_bw() + theme(legend.position = "none") 
ggsave(file="figures/Figure_3b.png", dpi = 600, units = "in")

df %>% 
ggplot(aes(x=opportunity_KE_pc01_wn, y=afd, size=population.bbsr)) +
  geom_point(shape=1) + geom_smooth(method="lm", color="black", se=FALSE) +
  xlab("Opportunity") + ylab("Populist Right Vote Share") +
  theme_bw() + theme(legend.position = "none") 
ggsave(file="figures/Figure_3c.png", dpi = 600, units = "in")


# SENSITIVITY ANALYSIS ----
# PCA with randomly dropping index components using bootstrap

# Re-create original PCAs
original_pca <- prcomp(opm_vars[, 4:ncol(opm_vars)], center = TRUE, scale. = TRUE)

fviz_eig(original_pca, addlabels = TRUE, ylim = c(0, 100))
ggbiplot(original_pca)

# Note: sign indeterminacy of PCA, direction of component can vary between R versions
# in contrast to the main model, here we do not manually flip the sign of PC2 in order to align with bootstrapped version below
# This has no substantive impact, as the component captures the same underlying structure regardless of direction.

original_pc1 <- original_pca$x[, 1]
original_pc2 <- original_pca$x[, 2]

# Bootstrap function to drop 1 or 2 indicators randomly and perform PCA
boot_pca <- function(data, indices) {
  # Randomly decide how many indicators to drop (1 or 2)
  num_drop <- sample(1:2, 1)
  drop_cols <- sample(1:ncol(data), num_drop)
  
  # Create reduced dataset
  reduced_data <- data[, -drop_cols]
  
  # Perform PCA on the reduced dataset and return the first two components
  prcomp(reduced_data, center = TRUE, scale. = TRUE)$x[, 1:2]
}

set.seed(1115)
# Run bootstrap with 100 iterations
boot_results <- boot(
  opm_vars[, 4:ncol(opm_vars)],  
  boot_pca,                      
  R = 100                        
)


num_locations <- nrow(opm_vars[, 4:ncol(opm_vars)])  # Number of locations
bootstrap_results_array <- array(boot_results$t, dim = c(100, num_locations, 2))
# convert to data frame
bootstrap_pc1 <- as.data.frame(t(bootstrap_results_array[, , 1]))
bootstrap_pc2 <- as.data.frame(t(bootstrap_results_array[, , 2]))
# rename columns to reflect bootstrap iterations
colnames(bootstrap_pc1) <- paste0("Iter_", 1:100)
colnames(bootstrap_pc2) <- paste0("Iter_", 1:100)

# linear transformation of scores to avoid inflation of metrics by close-to-zero means
bootstrap_pc1_pos <- bootstrap_pc1+abs(min(bootstrap_pc1))
bootstrap_pc2_pos <- bootstrap_pc2+abs(min(bootstrap_pc2))

# calculate summary stats of bootstrap exercise
bootstrap_summary <- data.frame(
  Location = 1:num_locations,
  PC1_Mean = rowMeans(bootstrap_pc1),
  PC2_Mean = rowMeans(bootstrap_pc2),
  PC1_Mean_Pos = rowMeans(bootstrap_pc1_pos), # means based on transformed scores for CV analysis
  PC2_Mean_Pos = rowMeans(bootstrap_pc2_pos),
  PC1_SD_Pos = apply(bootstrap_pc1_pos, 1, sd),
  PC2_SD_Pos = apply(bootstrap_pc2_pos, 1, sd),
  PC1_CV_Pos = apply(bootstrap_pc1_pos, 1, sd) / (abs(rowMeans(bootstrap_pc1_pos)) + 1e-6), # CV based on transformed scores to avoid inflation of values for close-to-zero means
  PC2_CV_Pos = apply(bootstrap_pc2_pos, 1, sd) / (abs(rowMeans(bootstrap_pc2_pos)) + 1e-6)
)

# calculate 90th percentile as a threshold

pc1_cv_pos_threshold <- quantile(bootstrap_summary$PC1_CV_Pos, 0.9)
pc2_cv_pos_threshold <- quantile(bootstrap_summary$PC2_CV_Pos, 0.9)


# Density plot for PC1 CV
ggplot(bootstrap_summary, aes(x = PC1_CV_Pos)) +
  geom_density(fill = "blue", alpha = 0.4) +
  geom_vline(xintercept = pc1_cv_pos_threshold, color = "red", linetype = "dashed") +
  geom_vline(xintercept = 0.2, color = "red", linetype = "solid") +
  labs(
    title = "Distribution of Coefficient of Variation (PC1)",
    x = "Coefficient of Variation",
    y = "Density"
  ) +
  theme_minimal()
ggsave("figures/Appendix_Figure_B5a.png", dpi = 600, units = "in")


# Density plot for PC2 CV
ggplot(bootstrap_summary, aes(x = PC2_CV_Pos)) +
  geom_density(fill = "green", alpha = 0.4) +
  geom_vline(xintercept = pc2_cv_pos_threshold, color = "red", linetype = "dashed") +
  geom_vline(xintercept = 0.2, color = "red", linetype = "solid") +
  
  labs(
    title = "Distribution of Coefficient of Variation (PC2)",
    x = "Coefficient of Variation",
    y = "Density"
  ) +
  theme_minimal()
ggsave("figures/Appendix_Figure_B5b.png", dpi = 600, units = "in")


# reshape bootstrap data for jitter plots
bootstrap_pc1_long <- bootstrap_pc1 %>%
  mutate(Location = 1:nrow(bootstrap_pc1)) %>%
  pivot_longer(cols = -Location, names_to = "Iteration", values_to = "PC1")

bootstrap_pc2_long <- bootstrap_pc2 %>%
  mutate(Location = 1:nrow(bootstrap_pc2)) %>%
  pivot_longer(cols = -Location, names_to = "Iteration", values_to = "PC2")


# jitter plot for PC1
ggplot(bootstrap_pc1_long, aes(x = Location, y = PC1)) +
  geom_jitter(alpha = 0.1, color = "grey", width = 0.2, height = 0.2) +
  geom_point(data = bootstrap_summary, aes(x = Location, y = PC1_Mean, shape = "Bootstrap Mean"),
             color = "black", size = 3) +
  geom_point(data = data.frame(Location = 1:401, PC1 = original_pc1),
             aes(x = Location, y = PC1, shape = "Original"),
             color = "black", size = 3) +
  scale_shape_manual(
    values = c("Bootstrap Mean" = 4, "Original" = 1),
    name = ""
  ) +
  labs(
    title = "Bootstrap PCA Results for First Component",
    x = "Location",
    y = "PC1 Scores"
  ) +
  theme_minimal() + theme(legend.position = "bottom")
ggsave("figures/Appendix_Figure_B4a.png", dpi = 600, units = "in")

# jitter plot for PC2
ggplot(bootstrap_pc2_long, aes(x = Location, y = PC2)) +
  geom_jitter(alpha = 0.1, color = "grey", width = 0.8, height = 0.5) +
  geom_point(data = bootstrap_summary, aes(x = Location, y = PC2_Mean, shape = "Bootstrap Mean"),
             color = "black", size = 3) +
  geom_point(data = data.frame(Location = 1:401, PC2 = original_pc2),
             aes(x = Location, y = PC2, shape = "Original"),
             color = "black", size = 3) +
  scale_shape_manual(
    values = c("Bootstrap Mean" = 4, "Original" = 1),
    name = ""
  ) +
  labs(
    title = "Bootstrap PCA Results for Second Component",
    x = "Location",
    y = "PC2 Scores"
  ) +
  theme_minimal() + theme(legend.position = "bottom")
ggsave("figures/Appendix_Figure_B4b.png", dpi = 600, units = "in")



